Detailing common methods and preprocessing steps for regime shift detection methods

Published

January 5, 2025

Code
library(here)
library(tidyverse)
library(rshift)
library(gmRi)
library(mgcv)
library(EnvCpt)
library(showtext)
library(patchwork)

# Make sure lag is done correctly
conflicted::conflict_prefer("lag", "stats")
conflicted::conflict_prefer("filter", "dplyr")

# Path to timeseries
lob_ecol_path <- cs_path("mills", "Projects/Lobster ECOL")

# Load and add inshore/offshore labels on the daily data
lobecol_fvcom <- read_csv(
  str_c(lob_ecol_path, "FVCOM_processed/area_timeseries/all_regions_fvcom_temperatures_daily.csv")) %>% 
  mutate(depth_type = if_else(area_id %in% c("GOM_GBK", "SNE"), "offshore", "nearshore"))

theme_set(theme_gmri())
Code
# Use GMRI style for fonts
use_gmri_style_rmd()

STARS Regime Shifts - LobECOL Areas

{rstars} Implementation

The following results extend the detailed approach of Stirnimann et al. 2019’s {rstars} repository.

Code
# Stirnimann used these values in their paper:
# l = 5,10,15,17.5 years, with monthly data
# Huber = 1
# Subsampling = (l + 1)/3


# Load the function(s)
source(here::here("rstars-master","rSTARS.R"))

Data Pre-Processing Routines

For our results we’ve expanded on their approach with two additional pre-procesing steps recommended by Rodionov (pretty sure) & in the recent review by Lund et al. 2023’s.

These two additional steps include removing the seasonal cycle using a GAM and a cyclic cubic regression spline to model the seasonal cycle.

The other additional pre-processing step is the removal of the long-term trend. To illustrate these pre-processing steps, a daily bottome temperature timeseries for Long Island Sound will be used below:

Code
# Test timeseries
test_area <- "Long Island Sound"
tester <- filter(lobecol_fvcom, area_id == test_area)

# Plot Surface and Bottom

ggplot(tester, aes(time)) +
  geom_line(aes(y = surface_t, color = "Surface Temperature"), alpha = 0.8) +
  geom_line(aes(y = bottom_t, color = "Bottom Temperature"), alpha = 0.8) + 
  scale_color_gmri() +
  labs(y = "Temperature", title = "Long Island Sound FVCOM Temperatures")

Removing Seasonalilty

As expected, ocean temperatures in Long Island Sound follow a prnonounced seasonal cycle. We can model this periodicity to get a sense of the average temperature based on the day of the year.

Code
# Make a day of year value to cycle over
tester <- tester %>% mutate(yday = lubridate::yday(time))

# Model a linear trend and the seasonal cycle
season_only_gam <- gam(
  bottom_t ~ s(yday, bs = "cc"), 
  data = tester,
  method = "REML")

# Add the predictions and residuals
season_test <- broom::augment(x = season_only_gam) %>% 
  rename(seasonal_fit = .fitted,
         seasonal_resid = .resid) %>% 
  distinct(yday, seasonal_fit)

Seasonal temperatures are at their lowest during February-March, and they peak during July-August. Each individual year will vary, but the seasonal period is quite consistent.

Code
# Plot what that looke like on an annual time scale
ggplot(tester, aes(x = yday)) +
  geom_line(
    aes(y = bottom_t, color = "Single Year Series", group = year(time)), 
    linewidth = 0.5, alpha = 0.75) +
  geom_line(
    data = season_test,
    aes(y = seasonal_fit, color = "Mean Seasonal Cycle"), 
    alpha = 0.75, linewidth = 2) +
  scale_color_grey() +
  labs(color = "Data",
       y = "Bottom Temperature",
       x = "Day of Year",
       title = "Long Island Sound Seasonality")

Code
# # Plot them both?
# tester %>% 
#   left_join(season_test, by = join_by("yday")) %>% 
#   ggplot(aes(x = time)) +
#   geom_line(aes(y = bottom_t, color = "Daily Temperatures"), alpha = 0.5, linewidth = 1) +
#   geom_line(aes(y = seasonal_fit, color = "Seasonal Means"), alpha = 0.5, linewidth = 0.5) +
#   scale_color_gmri() +
#   labs(y = "Bottom Temperature", color = "Data")


tester %>%
  left_join(season_test, by = join_by("yday")) %>% 
  mutate(seasonal_resid = bottom_t - seasonal_fit)  %>% 
  ggplot(aes(time, seasonal_resid)) +
  geom_line( alpha = 0.75) +
  geom_hline(yintercept = 0, aes(color = "Seasonal Mean Temeprature"), linewidth = 2, alpha = 0.75) +
  labs(y = "Seasonal Temperature Anomaly",
       title = "Removal of Seasonal Period")

Running STARS with Full Routine

The last step is to run it through the RSTARS algorithm and specify a few more arguments for how to treat the timeseries.

We need to set arguments for regime length, outlier weighting, and for our regime probability cutoff. There are also options for prewhitening methods which help address autocorrelation in the timeseries.

Code
# Run it with

# Stirnimann used these values in their paper:
# l = 5,10,15,17.5 years, with monthly data
# Huber = 1
# Subsampling = (l + 1)/3

# He also recommended using prewhitening, and recommended MPK and IP4

# Or do the prewhitening on the detrended data?
lis_rstars <- rstars(
  data.timeseries = as.data.frame(season_trend_results[,c("time", "seasonal_resid")]), 
  l.cutoff = 365*7, 
  pValue = 0.05, 
  Huber = 1, 
  Endfunction = T,    # Some behavior at the end of the timeseries
  preWhitening = T,   # Apply prewhitening T/F
  OLS = F,            # OLSprewhitening method T/F
  MPK = T,            # Marriott-Pope + Kennedy prewhitening method T/F
  IP4 = F,            # IP4 prewhitening method T/F
  SubsampleSize = (365*7 + 1) / 3, # subsampling rate for hubers + prewhitening
  returnResults = T) # Return the results as a dataframe

  


# Results now look like this:
ggplot(lis_rstars) +
  geom_line(
    aes(time, seasonal_resid), 
    linewidth = 0.5, alpha = 0.5) +
  geom_line(aes(time, regime_mu), linewidth = 1) +
  geom_vline(
    data = filter(lis_rstars, RSI != 0),
    aes(xintercept = time), color = "red", linewidth = 1) +
  scale_y_continuous(limits = c(-0.35, 0.35)) +
  labs(x = "", 
       y = "Detrended Bottom Temp Anomaly",
       title = "STARS Results - After full pre-processing steps",
       subtitle = test_area)

Daily Bottom Temperature RSI

If we apply the above steps to each daily bottom temperature timeseries we can return the following results:

Code
# # Run them all
# bot_temp_shifts <- lobecol_fvcom %>%
#   mutate(yday = lubridate::yday(time)) %>% 
#   split(.$area_id) %>%
#   map_dfr(function(.x){
#     
#     # Fit the model to detrend and remove seasons
#     # Model a linear trend and the seasonal cycle
#     season_trend_model <- gam(
#       bottom_t ~ s(yday, bs = "cc") + time, 
#       data = .x,
#       method = "REML")
#     
#     # save the results
#     preprocessed_results <- broom::augment(x = season_trend_model) %>% 
#       rename(
#         model_fit = .fitted,
#         model_resid = .resid) 
#     
#     
#     # Get the results from that
#     x_rstars <- rstars(
#       data.timeseries = as.data.frame(preprocessed_results[,c("time", "model_resid")]), 
#       l.cutoff = 365*7,
#       pValue = 0.05,
#       Huber = 1,
#       Endfunction = T,
#       preWhitening = T,
#       OLS = F,
#       MPK = T,
#       IP4 = F,
#       SubsampleSize = (365*7 + 1)/3,
#       returnResults = T)
#     
#     return(x_rstars)
# 
#     }, .id = "area_id")
# 
# 
# # Save it
# write_csv(bot_temp_shifts, here::here("rstars_results/lobecol_btemp_shifts_detrended.csv"))
Code
# Load that data again
bot_temp_shifts <- read_csv(here::here("rstars_results/lobecol_btemp_shifts_detrended.csv"))


# Plot the RSI
bot_temp_shifts %>% 
 ggplot() +
  geom_line(aes(time, RSI), linewidth = 0.5, alpha = 0.35) +
  geom_hline(yintercept = 0) +
  facet_grid() +
  labs(
    x = "Time",
    y = "RSI",
    title = "All Areas, Bottom Temperature RSI")

Code
ggplot(bot_temp_shifts) +
  geom_line(aes(time, regime_mu_pw), 
            linewidth = 0.5, alpha = 0.5) +
  geom_line(aes(time, regime_mu), linewidth = 1) +
  geom_vline(
    data = filter(bot_temp_shifts, RSI != 0),
    aes(xintercept = time), color = "red", linewidth = 1) +
  scale_y_continuous(limits = c(-0.35, 0.35)) +
  labs(x = "", 
       y = "Detrended Bottom Temp Anomaly",
       title = "Daily BT STARS Results",
       caption = "Full prewhitening+detrending routine",
       subtitle = test_area)

Daily Surface Temperature RSI

Repeating the process for daily sea surface temperatures gives us these RSI results:

Code
# # Run them all
# surf_temp_shifts <- lobecol_fvcom %>%
#   mutate(yday = lubridate::yday(time)) %>% 
#   split(.$area_id) %>%
#   map_dfr(function(.x){
#     
#     # Fit the model to detrend and remove seasons
#     # Model a linear trend and the seasonal cycle
#     season_trend_model <- gam(
#       surface_t ~ s(yday, bs = "cc") + time, 
#       data = .x,
#       method = "REML")
#     
#     # save the results
#     preprocessed_results <- broom::augment(x = season_trend_model) %>% 
#       rename(
#         model_fit = .fitted,
#         model_resid = .resid) 
#     
#     
#     # Get the results from that
#     x_rstars <- rstars(
#       data.timeseries = as.data.frame(preprocessed_results[,c("time", "model_resid")]), 
#       l.cutoff = 365*7,
#       pValue = 0.05,
#       Huber = 1,
#       Endfunction = T,
#       preWhitening = T,
#       OLS = F,
#       MPK = T,
#       IP4 = F,
#       SubsampleSize = (365*7 + 1)/3,
#       returnResults = T)
#     
#     return(x_rstars)
# 
#     }, .id = "area_id")
# 
# 
# # Save it
# write_csv(surf_temp_shifts, here::here("rstars_results/lobecol_stemp_shifts_detrended.csv"))
Code
# Load that data again
surf_temp_shifts <- read_csv(here::here("rstars_results/lobecol_stemp_shifts_detrended.csv"))


# Plot the RSI
surf_temp_shifts %>% 
 ggplot() +
  geom_line(aes(time, RSI), linewidth = 0.5, alpha = 0.35) +
  geom_hline(yintercept = 0) +
  facet_grid() +
  labs(
    x = "Time",
    y = "RSI",
    title = "All Areas, Surface Temperature RSI")

Code
ggplot(surf_temp_shifts) +
  geom_line(aes(time, regime_mu_pw), 
            linewidth = 0.5, alpha = 0.5) +
  geom_line(aes(time, regime_mu), linewidth = 1) +
  geom_vline(
    data = filter(surf_temp_shifts, RSI != 0),
    aes(xintercept = time), color = "red", linewidth = 1) +
  scale_y_continuous(limits = c(-0.35, 0.35)) +
  labs(x = "", 
       y = "Detrended Bottom Temp Anomaly",
       title = "Daily SST STARS Results",
       caption = "Full prewhitening+detrending routine",
       subtitle = test_area)

Code
surf_temp_shifts %>% 
  mutate(RSI = if_else(RSI == 0, NA, RSI),
         RSI_col = if_else(RSI>0, "orange", "royalblue")) %>% 
  ggplot(aes(x = time, y = area_id)) +
  geom_point(aes(size = abs(RSI), color = I(RSI_col))) +
  labs(title = "Daily Surface Temperature RSI")

Code
bot_temp_shifts %>% 
  mutate(RSI = if_else(RSI == 0, NA, RSI),
         RSI_col = if_else(RSI>0, "orange", "royalblue")) %>% 
  ggplot(aes(x = time, y = area_id)) +
  geom_point(aes(size = abs(RSI), color = I(RSI_col)))  +
  theme(axis.text.y = element_blank(),
        legend.position = "none") +
  labs(title = "Daily Bottom Temperature RSI")

Processing Monthly Means

Other than temperature we have very few daily environmental timeseries. We also are less interested in the time within a year that changes ocurred, and more interested in the general year/month that changes are ocurring in.

For these reasons and for consistency with the other environmental drivers, I will repeat these steps using monthly mean SSt & BT.

Monthly Temperature RSI

Code
# Take the input data,
# Get monthly averages
# remove trend and long-term monthly means
# run STARS routine again


# Run the monthly versions

# # Run them all
# surf_temp_monthly_shifts <- lobecol_fvcom %>%
#   mutate(
#     year = lubridate::year(time),
#     month = lubridate::month(time)) %>%
#   split(.$area_id) %>%
#   map_dfr(function(.x){
#     
#     # Make it monthly
#     .x_monthly <- .x %>% 
#       group_by(year, month) %>% 
#       summarise(
#         surface_t = mean(surface_t, na.rm = T),
#         bottom_t  = mean(bottom_t, na.rm = T),
#         .groups = "drop") %>% 
#       mutate(month = factor(month),
#              yr_num = as.numeric(as.character(year)))
#     
# 
#     # Fit the model to detrend and remove seasons
#     # Model a linear trend and the seasonal cycle
#     season_trend_model <- gam(
#       surface_t ~ month + yr_num,
#       data = .x_monthly,
#       method = "REML")
# 
#     # save the results
#     preprocessed_results <- broom::augment(x = season_trend_model) %>%
#       rename(
#         model_fit = .fitted,
#         model_resid = .resid) %>% 
#       # Rebuild a "date" column to pass to the function
#       mutate(time = as.Date(
#         str_c(
#           yr_num, 
#           str_pad(month, side = "left", pad = "0", width = 2),
#           "15", sep = "-")))
#     #return(preprocessed_results) #check 
# 
# 
#     # Get the results from that
#     x_rstars <- rstars(
#       data.timeseries = as.data.frame(
#         preprocessed_results[,c("time", "model_resid")]),
#       l.cutoff = 12*7,
#       pValue = 0.05,
#       Huber = 1,
#       Endfunction = T,
#       preWhitening = T,
#       OLS = F,
#       MPK = T,
#       IP4 = F,
#       SubsampleSize = (12*7 + 1)/3,
#       returnResults = T)
# 
#     return(x_rstars)
# 
#     }, .id = "area_id")
# 
# 
# 
# #Bottom temperature
# bot_temp_monthly_shifts <- lobecol_fvcom %>%
#   mutate(
#     year = lubridate::year(time),
#     month = lubridate::month(time)) %>%
#   split(.$area_id) %>%
#   map_dfr(function(.x){
#     
#     # Make it monthly
#     .x_monthly <- .x %>% 
#       group_by(year, month) %>% 
#       summarise(
#         surface_t = mean(surface_t, na.rm = T),
#         bottom_t  = mean(bottom_t, na.rm = T),
#         .groups = "drop") %>% 
#       mutate(month = factor(month),
#              yr_num = as.numeric(as.character(year)))
#     
# 
#     # Fit the model to detrend and remove seasons
#     # Model a linear trend and the seasonal cycle
#     season_trend_model <- gam(
#       bottom_t ~ month + yr_num,
#       data = .x_monthly,
#       method = "REML")
# 
#     # save the results
#     preprocessed_results <- broom::augment(x = season_trend_model) %>%
#       rename(
#         model_fit = .fitted,
#         model_resid = .resid) %>% 
#       # Rebuild a "date" column to pass to the function
#       mutate(time = as.Date(
#         str_c(
#           yr_num, 
#           str_pad(month, side = "left", pad = "0", width = 2),
#           "15", sep = "-")))
#     #return(preprocessed_results) #check 
# 
# 
#     # Get the results from that
#     x_rstars <- rstars(
#       data.timeseries = as.data.frame(
#         preprocessed_results[,c("time", "model_resid")]),
#       l.cutoff = 12*7,
#       pValue = 0.05,
#       Huber = 1,
#       Endfunction = T,
#       preWhitening = T,
#       OLS = F,
#       MPK = T,
#       IP4 = F,
#       SubsampleSize = (12*7 + 1)/3,
#       returnResults = T)
# 
#     return(x_rstars)
# 
#     }, .id = "area_id")
# 
# 
# 
# 
# 
# # Save them
# write_csv(surf_temp_monthly_shifts, here::here("rstars_results/lobecol_stemp_monthly_shifts_detrended.csv"))
# write_csv(bot_temp_monthly_shifts, here::here("rstars_results/lobecol_btemp_monthly_shifts_detrended.csv"))
Code
# Load that data again
surf_temp_monthly_shifts <- read_csv(here::here("rstars_results/lobecol_stemp_monthly_shifts_detrended.csv"))


# Plot the RSI for both
surf_temp_monthly_shifts %>% 
 ggplot() +
  geom_line(aes(time, RSI), linewidth = 0.5, alpha = 0.35) +
  geom_hline(yintercept = 0) +
  facet_grid() +
  labs(
    x = "Time",
    y = "RSI",
    title = "Monthly Surface Temperature RSI")

Code
bot_temp_monthly_shifts <- read_csv(here::here("rstars_results/lobecol_btemp_monthly_shifts_detrended.csv"))



# Plot the RSI for both
bot_temp_monthly_shifts %>% 
 ggplot() +
  geom_line(aes(time, RSI), linewidth = 0.5, alpha = 0.35) +
  geom_hline(yintercept = 0) +
  facet_grid() +
  labs(
    x = "Time",
    y = "RSI",
    title = "Monthly Bottom Temperature RSI")

Code
surf_temp_monthly_shifts %>% 
  mutate(RSI = if_else(RSI == 0, NA, RSI),
         RSI_col = if_else(RSI>0, "orange", "royalblue")) %>% 
  ggplot(aes(x = time, y = area_id)) +
  geom_point(aes(size = abs(RSI), color = I(RSI_col))) +
  labs(title = "Monthly Surface Temperature RSI")

Code
bot_temp_monthly_shifts %>% 
  mutate(RSI = if_else(RSI == 0, NA, RSI),
         RSI_col = if_else(RSI>0, "orange", "royalblue")) %>% 
  ggplot(aes(x = time, y = area_id)) +
  geom_point(aes(size = abs(RSI), color = I(RSI_col)))  +
  theme(axis.text.y = element_blank(),
        legend.position = "none") +
  labs(title = "Monthly Bottom Temperature RSI")

Other Environmental Features

In addition to temperature, we have the additional environmental covariates as well.